In [1]:
# Core data stack, plotting libraries, and label encoding used throughout.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
# NOTE(review): this single LabelEncoder instance is re-fit for every column
# later (cells 17 and 19); after the loops its fitted state reflects only the
# *last* column encoded. Consider one encoder per column if inverse_transform
# is ever needed.
le = LabelEncoder()
In [2]:
# Load the UCI "Bank Marketing" (bank-additional-full) dataset.
# The file is ';'-separated and uses the literal string 'unknown' for missing
# values, which na_values maps to NaN at load time.
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path for portability.
df = pd.read_csv(r"D:\Stuff\Data Science\Machine Learning\bank-additional-full.csv" , na_values='unknown' , sep=';')
print(df.shape)
df.head()
(41188, 21)
Out[2]:
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 1 | 57 | services | married | high.school | NaN | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
5 rows × 21 columns
In [3]:
# Column dtypes and non-null counts — only object columns carry missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41188 entries, 0 to 41187 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 41188 non-null int64 1 job 40858 non-null object 2 marital 41108 non-null object 3 education 39457 non-null object 4 default 32591 non-null object 5 housing 40198 non-null object 6 loan 40198 non-null object 7 contact 41188 non-null object 8 month 41188 non-null object 9 day_of_week 41188 non-null object 10 duration 41188 non-null int64 11 campaign 41188 non-null int64 12 pdays 41188 non-null int64 13 previous 41188 non-null int64 14 poutcome 41188 non-null object 15 emp.var.rate 41188 non-null float64 16 cons.price.idx 41188 non-null float64 17 cons.conf.idx 41188 non-null float64 18 euribor3m 41188 non-null float64 19 nr.employed 41188 non-null float64 20 y 41188 non-null object dtypes: float64(5), int64(5), object(11) memory usage: 6.6+ MB
In [4]:
# Missing-value count per column ('unknown' was mapped to NaN at load time).
df.isna().sum()
Out[4]:
age 0 job 330 marital 80 education 1731 default 8597 housing 990 loan 990 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp.var.rate 0 cons.price.idx 0 cons.conf.idx 0 euribor3m 0 nr.employed 0 y 0 dtype: int64
In [5]:
# Find the columns that contain missing values and inspect their dtypes
# (they all turn out to be object/categorical columns).
na_mask = df.isnull().any()
na_cols = df.columns[na_mask]
na_dtypes = df[na_cols].dtypes
print(na_dtypes)
job object marital object education object default object housing object loan object dtype: object
In [6]:
# Summary statistics of the numeric columns, rounded and transposed for
# readability. Note: pdays uses 999 as a "not previously contacted" sentinel
# (75th percentile and max are both 999), which skews its mean/std.
df.describe().round().T
Out[6]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| age | 41188.0 | 40.0 | 10.0 | 17.0 | 32.0 | 38.0 | 47.0 | 98.0 |
| duration | 41188.0 | 258.0 | 259.0 | 0.0 | 102.0 | 180.0 | 319.0 | 4918.0 |
| campaign | 41188.0 | 3.0 | 3.0 | 1.0 | 1.0 | 2.0 | 3.0 | 56.0 |
| pdays | 41188.0 | 962.0 | 187.0 | 0.0 | 999.0 | 999.0 | 999.0 | 999.0 |
| previous | 41188.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.0 |
| emp.var.rate | 41188.0 | 0.0 | 2.0 | -3.0 | -2.0 | 1.0 | 1.0 | 1.0 |
| cons.price.idx | 41188.0 | 94.0 | 1.0 | 92.0 | 93.0 | 94.0 | 94.0 | 95.0 |
| cons.conf.idx | 41188.0 | -41.0 | 5.0 | -51.0 | -43.0 | -42.0 | -36.0 | -27.0 |
| euribor3m | 41188.0 | 4.0 | 2.0 | 1.0 | 1.0 | 5.0 | 5.0 | 5.0 |
| nr.employed | 41188.0 | 5167.0 | 72.0 | 4964.0 | 5099.0 | 5191.0 | 5228.0 | 5228.0 |
In [7]:
# Impute missing values in the categorical columns with each column's mode
# (most frequent category).
na_cols_list = na_cols.tolist()
for col in na_cols_list:
    # .mode() returns a Series (first entry is the smallest value on ties);
    # .iloc[0] is the idiomatic positional accessor (vs. the original .values[0]).
    df[col] = df[col].fillna(df[col].mode().iloc[0])
# Verify that no missing values remain
print(df.isna().sum())
age 0 job 0 marital 0 education 0 default 0 housing 0 loan 0 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp.var.rate 0 cons.price.idx 0 cons.conf.idx 0 euribor3m 0 nr.employed 0 y 0 dtype: int64
In [8]:
# Boxplot of age to visualize outliers before filtering.
sns.set_style("darkgrid")
fig, ax = plt.subplots(figsize=(13, 3))
ax.set_title("Detected Outliers")
sns.boxplot(x='age', data=df, ax=ax)
plt.show()
In [9]:
# Drop age outliers using Tukey's 1.5*IQR fences (strict inequalities,
# matching the original filter).
q1, q3 = df['age'].quantile([0.25, 0.75])
iqr = q3 - q1
age_lower = q1 - 1.5 * iqr
age_upper = q3 + 1.5 * iqr
df = df[df['age'].gt(age_lower) & df['age'].lt(age_upper)]
In [10]:
# Interactive boxplot of call duration before outlier removal.
fig = px.box(df, x='duration')
fig.update_traces(marker={'color': '#FF851B'})
fig.update_layout(title='Detected Outliers', autosize=False, width=1100, height=400)
fig.show()
In [11]:
# Remove duration outliers with Tukey's 1.5*IQR fences, then apply the
# notebook's additional hard cap.
Q1 = df['duration'].quantile(0.25)
Q3 = df['duration'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Fix: the original computed upper_bound but never used it — it re-derived the
# expression inline and then filtered a second time with a bare magic number.
DURATION_CAP = 490  # manual cutoff from the original; TODO confirm why 490
df = df[(df['duration'] > lower_bound) & (df['duration'] < upper_bound)]
df = df[df['duration'] < DURATION_CAP]
In [12]:
# Re-plot duration after outlier removal to confirm the distribution tightened.
fig = px.box(df, x='duration')
fig.update_traces(marker={'color': '#FF851B'})
fig.update_layout(title='Detected Outliers', autosize=False, width=1100, height=400)
fig.show()
In [13]:
def pie_plot(ax, col, df, title="Pie Chart"):
    """Draw a pie chart of the value counts of df[col] on the given axes."""
    counts = df[col].value_counts()
    palette = sns.color_palette('pastel')[:len(counts)]
    ax.pie(counts.values, labels=counts.index, autopct='%.1f%%',
           colors=palette, startangle=140)
    ax.set_title(title)

# One pie per categorical column of interest, laid out on a 2x3 grid.
columns = ['job', 'education', 'loan', 'month', 'day_of_week', 'contact']
titles = ['Distribution of Jobs', 'Distribution of Education', 'Distribution of Loan',
          'Distribution of Month', 'Distribution of Day of Week', 'Distribution of Contact']
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()
for ax, col, title in zip(axes, columns, titles):
    pie_plot(ax, col, df, title=title)
plt.tight_layout()
plt.show()
In [14]:
# Distribution of the numeric columns with a dashed mean reference line.
numeric_columns = df.select_dtypes(include=['int64', 'float64'])
numeric_columns_list = numeric_columns.columns.tolist()
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()
# Fix: the original zipped the numeric columns with `titles` (pie-chart titles
# leaked from a previous cell), which silently truncated the loop to 6 of the
# 10 numeric columns and never used `title`. Zipping against the axes instead
# keeps the same first-6 columns plotted without the cross-cell dependency.
for ax, col in zip(axes, numeric_columns_list):
    sns.histplot(df[col], kde=True, bins=10, ax=ax)
    ax.set_title(f"{col} distribution")
    ax.tick_params(axis='x', rotation=45)
    # select_dtypes already guarantees the column is numeric, so the original
    # redundant dtype re-check is dropped and the mean line drawn directly.
    mean_value = df[col].mean()
    ax.axvline(mean_value, ls='--', color='red')
    # Place the label near the top of the axis, anchored at the mean.
    max_y = ax.get_ylim()[1]
    ax.text(mean_value, max_y * 0.85, f'<-- Mean: {mean_value:.2f}', color='black')
plt.tight_layout()
plt.show()
In [15]:
# Count of clients per job category, most frequent first, with bar labels.
plt.figure(figsize=(12, 4))
job_order = df['job'].value_counts().index
ax = sns.countplot(data=df, x='job', hue='job', order=job_order, palette='viridis')
plt.xticks(rotation=45)
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height}',
                (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center',
                fontsize=10, color='black',
                xytext=(0, 5), textcoords='offset points')
plt.title("Job Distribution")
plt.show()
In [16]:
# Class balance of the target — the 'no' class dominates heavily.
plt.figure(figsize=(5, 3))
y_order = df['y'].value_counts().index
ax = sns.countplot(x='y', data=df, hue='y', order=y_order, palette='viridis')
plt.xticks(rotation=45)
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height}',
                (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center',
                fontsize=10, color='black',
                xytext=(0, 5), textcoords='offset points')
plt.title("Distribution of Y")
plt.show()
In [17]:
# Work on a copy so the cleaned `df` stays in its original (un-encoded) form.
df1= df.copy()
# Binary target: 'no'/'yes' -> 0/1 (LabelEncoder orders classes alphabetically).
df1['y'] = le.fit_transform(df1['y'])
In [18]:
# Confirm the target is now numeric; the feature columns are still object.
df1.dtypes
Out[18]:
age int64 job object marital object education object default object housing object loan object contact object month object day_of_week object duration int64 campaign int64 pdays int64 previous int64 poutcome object emp.var.rate float64 cons.price.idx float64 cons.conf.idx float64 euribor3m float64 nr.employed float64 y int32 dtype: object
In [19]:
# Label-encode every remaining object column in place (the shared `le` is
# re-fit for each column, so its final fitted state is the last column's).
# NOTE(review): LabelEncoder imposes an arbitrary alphabetical ordering on
# nominal categories — one-hot encoding may suit these features better; confirm.
categorical_variables = df1.select_dtypes(include='object').columns.tolist()
for column in categorical_variables:
    df1[column] = le.fit_transform(df1[column])
df1.dtypes
Out[19]:
age int64 job int32 marital int32 education int32 default int32 housing int32 loan int32 contact int32 month int32 day_of_week int32 duration int64 campaign int64 pdays int64 previous int64 poutcome int32 emp.var.rate float64 cons.price.idx float64 cons.conf.idx float64 euribor3m float64 nr.employed float64 y int32 dtype: object
In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 80/20 train/test split, then standardize features — the scaler is fit on the
# training split only, so no statistics leak from the test set.
# NOTE(review): the target is heavily imbalanced — consider stratify=y so both
# splits keep the class ratio (would change the reported numbers); confirm.
X = df1.drop(columns='y')
y = df1['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Small fully-connected binary classifier: input -> 16 -> 8 -> 1 (sigmoid).
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(units=16, activation='relu'),
    Dense(units=8, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
In [22]:
# Adam + binary cross-entropy to match the single sigmoid output; track accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
In [23]:
# Train for 10 epochs; 20% of the training split is held out for validation.
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.2)
Epoch 1/10 714/714 ━━━━━━━━━━━━━━━━━━━━ 10s 8ms/step - accuracy: 0.8071 - loss: 0.3860 - val_accuracy: 0.9439 - val_loss: 0.1455 Epoch 2/10 714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 7ms/step - accuracy: 0.9483 - loss: 0.1351 - val_accuracy: 0.9476 - val_loss: 0.1403 Epoch 3/10 714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 8ms/step - accuracy: 0.9486 - loss: 0.1335 - val_accuracy: 0.9494 - val_loss: 0.1374 Epoch 4/10 714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 7ms/step - accuracy: 0.9517 - loss: 0.1260 - val_accuracy: 0.9478 - val_loss: 0.1367 Epoch 5/10 714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 7ms/step - accuracy: 0.9524 - loss: 0.1234 - val_accuracy: 0.9502 - val_loss: 0.1345 Epoch 6/10 714/714 ━━━━━━━━━━━━━━━━━━━━ 4s 6ms/step - accuracy: 0.9544 - loss: 0.1150 - val_accuracy: 0.9501 - val_loss: 0.1330 Epoch 7/10 714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 7ms/step - accuracy: 0.9516 - loss: 0.1185 - val_accuracy: 0.9494 - val_loss: 0.1324 Epoch 8/10 714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 6ms/step - accuracy: 0.9509 - loss: 0.1222 - val_accuracy: 0.9504 - val_loss: 0.1321 Epoch 9/10 714/714 ━━━━━━━━━━━━━━━━━━━━ 6s 8ms/step - accuracy: 0.9536 - loss: 0.1138 - val_accuracy: 0.9502 - val_loss: 0.1300 Epoch 10/10 714/714 ━━━━━━━━━━━━━━━━━━━━ 6s 9ms/step - accuracy: 0.9518 - loss: 0.1181 - val_accuracy: 0.9508 - val_loss: 0.1295
In [24]:
# Final evaluation on the held-out test set.
# NOTE(review): accuracy on this imbalanced target is optimistic (~89% 'no'
# baseline); also 'duration' is only known after the call ends, so including it
# likely inflates performance — the dataset docs advise dropping it; confirm.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy * 100:.2f} %")
223/223 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.9467 - loss: 0.1372 Test Loss: 0.1230, Test Accuracy: 94.98 %
In [25]:
# Learning curves: train vs validation accuracy per epoch.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(history.history['accuracy'], label='Train Data Accuracy')
ax.plot(history.history['val_accuracy'], label='Validation Data Accuracy')
ax.set_title('Model Accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend()
plt.show()